library(lubridate)
library(ggplot2)
library(dplyr)
library(plotly)
library(visdat)
library(tidyr)
library(data.table)
library(raster)
library(nnet)
library(purrr)
library(DataExplorer)
library(pscl)
library(tree)
library(rpart)
library(rpart.plot)
library(ISLR)
library(randomForest)
library(kableExtra)
library(broom)
library(rattle)
library(corrplot)
library(Metrics)
# read our data
survey_data_test <- read.csv("cozie_responses_and_physiological_data_test_public.csv",
sep = ",", header = TRUE)
# read our data
survey_data <- read.csv("cozie_responses_and_physiological_data_training.csv",
sep = ",", header = TRUE)
weather_rainfall_data <- read.csv("weather_rainfall.csv",
sep = ",", header = TRUE)
weather_wind_speed_data <- read.csv("weather_wind-speed.csv",
sep = ",", header = TRUE)
weather_wind_direction_data <- read.csv("weather_wind-direction.csv",
sep = ",", header = TRUE)
weather_stations_data <- read.csv("weather_stations.csv",
sep = ",", header = TRUE)
weather_temperature_data <- read.csv("weather_air-temperature.csv",
sep = ",", header = TRUE)
weather_humidity_data <- read.csv("weather_relative-humidity.csv",
sep = ",", header = TRUE)
clean_weather_data <- function(weather_data){
# remove rows that have only missing values and replace remaining NA values with average of row
avg_temps <-
rowMeans(subset(weather_data[rowSums(is.na(weather_data)) != ncol(weather_data),], select = c(-X)), na.rm = T)
weather_data <-
weather_data[rowSums(is.na(weather_data)) != ncol(weather_data),] %>%
mutate(across(where(is.numeric),
~ if_else(is.na(.), avg_temps, .)))
weather_data <- weather_data %>%
mutate_at(vars(colnames(weather_data)[colnames(weather_data) != "X"]), as.numeric)
return(weather_data)
}
weather_rainfall_data %>% plot_missing()
weather_rainfall_data_clean <-clean_weather_data(weather_rainfall_data)
weather_rainfall_data_clean %>% plot_missing()
weather_temperature_data %>% plot_missing()
weather_temperature_data_clean <-clean_weather_data(weather_temperature_data)
weather_temperature_data_clean %>% plot_missing()
weather_wind_direction_data %>% plot_missing()
weather_wind_direction_data_clean <-clean_weather_data(weather_wind_direction_data)
weather_wind_direction_data_clean %>% plot_missing()
weather_wind_speed_data %>% plot_missing()
weather_wind_speed_data_clean <-clean_weather_data(weather_wind_speed_data)
weather_wind_speed_data_clean %>% plot_missing()
weather_humidity_data %>% plot_missing()
weather_humidity_data_clean <-clean_weather_data(weather_humidity_data)
weather_humidity_data_clean %>% plot_missing()
weather_humidity_data_clean <- weather_humidity_data_clean %>% # convert time to time object
#mutate(date_time = ymd_hms(weather_humidity_data_clean$X))
mutate(date_time = ymd_hms(unlist(map(strsplit(weather_humidity_data_clean$X, split='+', fixed=TRUE), 1))))
weather_temperature_data_clean <- weather_temperature_data_clean %>% # convert time to time object
#mutate(date_time = ymd_hms(weather_temperature_data_clean$X))
mutate(date_time = ymd_hms(unlist(map(strsplit(weather_temperature_data_clean$X, split='+', fixed=TRUE), 1))))
weather_wind_direction_data_clean <- weather_wind_direction_data_clean %>% # convert time to time object
#mutate(date_time = ymd_hms(weather_wind_direction_data_clean$X))
mutate(date_time = ymd_hms(unlist(map(strsplit(weather_wind_direction_data_clean$X, split='+', fixed=TRUE), 1))))
weather_wind_speed_data_clean <- weather_wind_speed_data_clean %>% # convert time to time object
#mutate(date_time = ymd_hms(weather_wind_speed_data_clean$X))
mutate(date_time = ymd_hms(unlist(map(strsplit(weather_wind_speed_data_clean$X, split='+', fixed=TRUE), 1))))
weather_rainfall_data_clean <- weather_rainfall_data_clean %>% # convert time to time object
#mutate(date_time = ymd_hms(weather_rainfall_data_clean$X))
mutate(date_time = ymd_hms(unlist(map(strsplit(weather_rainfall_data_clean$X, split='+', fixed=TRUE), 1))))
inspect_weather_data <- function(weather_data, value_name){
weather_start_date <- min(weather_data$date_time)
weather_end_date <- max(weather_data$date_time)
weather_duration <- max(weather_data$date_time) - min(weather_data$date_time)
weather_frequency <- nrow(weather_data) / as.numeric(weather_duration)
print(paste('First day of measurement', weather_start_date))
print(paste('Last day of measurement', weather_end_date))
print(paste('Duration of measurement in days', weather_duration))
print(paste('Frequency of measurement per day', weather_frequency))
weather_plot <-
ggplot(data = as.data.frame(reshape2::melt(subset(weather_data, select = -c(X)), id="date_time")), aes(x = date_time, y = value, col = variable)) +
geom_line() +
theme_minimal() +
theme() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
xlab("Date") +
ylab(value_name)
return(weather_plot)
}
inspect_weather_data(weather_humidity_data_clean, "humidity %")
## [1] "First day of measurement 2022-10-10 00:01:00"
## [1] "Last day of measurement 2023-07-03 23:59:00"
## [1] "Duration of measurement in days 266.998611111111"
## [1] "Frequency of measurement per day 1384.83866437091"
inspect_weather_data(weather_rainfall_data_clean, "rainfall mm")
## [1] "First day of measurement 2022-10-10 00:05:00"
## [1] "Last day of measurement 2023-07-03 23:55:00"
## [1] "Duration of measurement in days 266.993055555556"
## [1] "Frequency of measurement per day 276.598954404765"
inspect_weather_data(weather_temperature_data_clean, "temperature °C")
## [1] "First day of measurement 2022-10-10 00:01:00"
## [1] "Last day of measurement 2023-07-03 23:59:00"
## [1] "Duration of measurement in days 266.998611111111"
## [1] "Frequency of measurement per day 1385.01094991131"
inspect_weather_data(weather_wind_direction_data_clean, "wind durection °")
## [1] "First day of measurement 2022-10-10 00:01:00"
## [1] "Last day of measurement 2023-07-03 23:59:00"
## [1] "Duration of measurement in days 266.998611111111"
## [1] "Frequency of measurement per day 1347.32910595665"
inspect_weather_data(weather_wind_speed_data_clean, "knots m/s")
## [1] "First day of measurement 2022-10-10 00:01:00"
## [1] "Last day of measurement 2023-07-03 23:59:00"
## [1] "Duration of measurement in days 266.998611111111"
## [1] "Frequency of measurement per day 1347.33285129448"
How many weather stations are there?
str(weather_stations_data)
## 'data.frame': 74 obs. of 5 variables:
## $ X : int 0 1 2 3 4 5 6 7 8 9 ...
## $ id : chr "S07" "S08" "S100" "S102" ...
## $ name : chr "Lornie Road" "Upper Thomson Road" "Woodlands Road" "Semakau Landfill" ...
## $ latitude : num 1.34 1.37 1.42 1.19 1.44 ...
## $ longitude: num 104 104 104 104 104 ...
What are the variable names?
names(survey_data)
## [1] "time" "q_alone_group"
## [3] "q_earphones" "id_participant"
## [5] "ws_latitude" "q_location"
## [7] "q_location_office" "q_location_transport"
## [9] "ws_longitude" "q_noise_kind"
## [11] "q_noise_nearby" "q_thermal_preference"
## [13] "ws_timestamp_location" "ws_timestamp_start"
## [15] "ts_oxygen_saturation" "ts_resting_heart_rate"
## [17] "ts_stand_time" "ts_step_count"
## [19] "ts_walking_distance" "ws_survey_count"
## [21] "Footprint.Proportion" "Footprint.Mean"
## [23] "Footprint.Stdev" "Perimeter.Total"
## [25] "Perimeter.Mean" "Perimeter.Stdev"
## [27] "Complexity.Mean" "Complexity.Stdev"
## [29] "Building.Count" "PopSum"
## [31] "Men" "Women"
## [33] "Elderly" "Youth"
## [35] "Children" "Civic"
## [37] "Commercial" "Entertainment"
## [39] "Food" "Healthcare"
## [41] "Institutional" "Recreational"
## [43] "Social" "Green.View.Mean"
## [45] "Green.View.Stdev" "Sky.View.Mean"
## [47] "Sky.View.Stdev" "Building.View.Mean"
## [49] "Building.View.Stdev" "Road.View.Mean"
## [51] "Road.View.Stdev" "Visual.Complexity.Mean"
## [53] "Visual.Complexity.Stdev" "dT"
## [55] "q_activity_category_alone" "q_activity_category_group"
## [57] "ts_heart_rate" "ts_audio_exposure_environment"
## [59] "id_unique"
survey_data <- survey_data %>%
# convert time to time object
#mutate(date_time = ymd_hms(survey_data$time)) %>%
mutate(date_time = ymd_hms(unlist(map(strsplit(survey_data$time, split='.', fixed=TRUE), 1)))) %>%
# replace empty strings with NA
mutate(across(where(is.character), ~na_if(., "")))
head(survey_data)
str(survey_data)
## 'data.frame': 1149136 obs. of 60 variables:
## $ time : chr "2022-10-10 09:32:04.588000+0800" "2022-10-10 09:32:04.588000+0800" "2022-10-10 09:33:08.713000+0800" "2022-10-10 09:35:11.600000+0800" ...
## $ q_alone_group : chr NA NA NA NA ...
## $ q_earphones : chr NA NA NA NA ...
## $ id_participant : chr "xesh001" "xesh001" "xesh001" "xesh001" ...
## $ ws_latitude : num NA NA NA NA NA NA NA NA NA NA ...
## $ q_location : chr NA NA NA NA ...
## $ q_location_office : chr NA NA NA NA ...
## $ q_location_transport : chr NA NA NA NA ...
## $ ws_longitude : num NA NA NA NA NA NA NA NA NA NA ...
## $ q_noise_kind : chr NA NA NA NA ...
## $ q_noise_nearby : chr NA NA NA NA ...
## $ q_thermal_preference : chr NA NA NA NA ...
## $ ws_timestamp_location : chr NA NA NA NA ...
## $ ws_timestamp_start : chr NA NA NA NA ...
## $ ts_oxygen_saturation : num NA NA NA NA NA NA NA NA NA NA ...
## $ ts_resting_heart_rate : num 57 NA NA NA NA NA NA NA NA NA ...
## $ ts_stand_time : num NA NA NA NA NA NA NA NA NA NA ...
## $ ts_step_count : num NA NA NA NA NA NA NA NA NA NA ...
## $ ts_walking_distance : num NA NA NA NA NA NA NA NA NA NA ...
## $ ws_survey_count : num NA NA NA NA NA NA NA NA NA NA ...
## $ Footprint.Proportion : num NA NA NA NA NA NA NA NA NA NA ...
## $ Footprint.Mean : num NA NA NA NA NA NA NA NA NA NA ...
## $ Footprint.Stdev : num NA NA NA NA NA NA NA NA NA NA ...
## $ Perimeter.Total : num NA NA NA NA NA NA NA NA NA NA ...
## $ Perimeter.Mean : num NA NA NA NA NA NA NA NA NA NA ...
## $ Perimeter.Stdev : num NA NA NA NA NA NA NA NA NA NA ...
## $ Complexity.Mean : num NA NA NA NA NA NA NA NA NA NA ...
## $ Complexity.Stdev : num NA NA NA NA NA NA NA NA NA NA ...
## $ Building.Count : num NA NA NA NA NA NA NA NA NA NA ...
## $ PopSum : num NA NA NA NA NA NA NA NA NA NA ...
## $ Men : num NA NA NA NA NA NA NA NA NA NA ...
## $ Women : num NA NA NA NA NA NA NA NA NA NA ...
## $ Elderly : num NA NA NA NA NA NA NA NA NA NA ...
## $ Youth : num NA NA NA NA NA NA NA NA NA NA ...
## $ Children : num NA NA NA NA NA NA NA NA NA NA ...
## $ Civic : num NA NA NA NA NA NA NA NA NA NA ...
## $ Commercial : num NA NA NA NA NA NA NA NA NA NA ...
## $ Entertainment : num NA NA NA NA NA NA NA NA NA NA ...
## $ Food : num NA NA NA NA NA NA NA NA NA NA ...
## $ Healthcare : num NA NA NA NA NA NA NA NA NA NA ...
## $ Institutional : num NA NA NA NA NA NA NA NA NA NA ...
## $ Recreational : num NA NA NA NA NA NA NA NA NA NA ...
## $ Social : num NA NA NA NA NA NA NA NA NA NA ...
## $ Green.View.Mean : num NA NA NA NA NA NA NA NA NA NA ...
## $ Green.View.Stdev : num NA NA NA NA NA NA NA NA NA NA ...
## $ Sky.View.Mean : num NA NA NA NA NA NA NA NA NA NA ...
## $ Sky.View.Stdev : num NA NA NA NA NA NA NA NA NA NA ...
## $ Building.View.Mean : num NA NA NA NA NA NA NA NA NA NA ...
## $ Building.View.Stdev : num NA NA NA NA NA NA NA NA NA NA ...
## $ Road.View.Mean : num NA NA NA NA NA NA NA NA NA NA ...
## $ Road.View.Stdev : num NA NA NA NA NA NA NA NA NA NA ...
## $ Visual.Complexity.Mean : num NA NA NA NA NA NA NA NA NA NA ...
## $ Visual.Complexity.Stdev : num NA NA NA NA NA NA NA NA NA NA ...
## $ dT : num NA NA NA NA NA NA NA NA NA NA ...
## $ q_activity_category_alone : chr NA NA NA NA ...
## $ q_activity_category_group : chr NA NA NA NA ...
## $ ts_heart_rate : num NA 83 59 61 63 67 74 76 78 90 ...
## $ ts_audio_exposure_environment: num NA NA NA NA NA NA NA NA NA NA ...
## $ id_unique : int 1 2 3 4 5 6 7 8 9 10 ...
## $ date_time : POSIXct, format: "2022-10-10 09:32:04" "2022-10-10 09:32:04" ...
How many users are in the study? How many data points for each user?
nrow(table(survey_data$id_participant))
## [1] 98
ggplot(survey_data[!is.na(survey_data$id_participant),],aes(id_participant)) +
geom_bar(stat = "count", position = "dodge") +
xlab("Participant Index") +
ylab("Number of Responses") +
scale_x_discrete(labels=seq(1,nrow(table(survey_data$id_participant)))) +
theme_minimal()
#
# ggsave("Number_of_Responses_per_partisipant.png",
# width = 18, height = 6, dpi = 200, units = "in", device='png')
i1 <- which(!is.na(survey_data$q_thermal_preference))
logs <- data.frame(log_row_index = rownames(survey_data[i1,]),
date_time = survey_data[i1,]$date_time)
grouped_logs <- merge(survey_data, logs, by = "date_time", all.x = TRUE) %>%
arrange(date_time) %>% # sort by time
fill(log_row_index, .direction = "down") %>% # fill group index
fill(log_row_index, .direction = "up") %>% # fill group index
group_by(id_participant) %>% # group by participant
arrange(id_participant, date_time) %>% #sort by user id then by time
group_by(log_row_index) # group by log group
nrow(grouped_logs)
## [1] 1149184
avg_heart_rate_past_10 <- grouped_logs %>% # group by log interval
summarize(average_heart_rate = mean(tail(na.omit(ts_heart_rate)), n = 10))
nrow(avg_heart_rate_past_10)
## [1] 4900
total_dist_past_10 <- grouped_logs %>% # group by log interval
summarize(dist_walked = sum(tail(na.omit(ts_walking_distance)), n = 10))
# take mean of last 10 non NaN walked distance measures
nrow(total_dist_past_10)
## [1] 4900
activity_data <- left_join(avg_heart_rate_past_10,
total_dist_past_10,
by = "log_row_index",
keep = FALSE)
head(activity_data)
activity_data_full <- survey_data[activity_data$log_row_index,]
activity_data_full$log_row_index <- activity_data$log_row_index
activity_data_full <- left_join(activity_data, activity_data_full, by = "log_row_index")
head(activity_data_full)
humidity_stations_ids <- colnames(weather_humidity_data_clean)[colnames(weather_humidity_data_clean) %in% weather_stations_data$id]
humidity_stations <- weather_stations_data[weather_stations_data$id %in% humidity_stations_ids, ]
d <- pointDistance(activity_data_full[,c("ws_longitude", "ws_latitude")],
humidity_stations[,c("longitude", "latitude")],
lonlat=TRUE, allpairs=T)
i <- apply(d, 1, which.min)
activity_data_full$humidity_ID = humidity_stations$id[i]
rainfall_stations_ids <- colnames(weather_rainfall_data_clean)[colnames(weather_rainfall_data_clean) %in% weather_stations_data$id]
rainfall_stations <- weather_stations_data[weather_stations_data$id %in% rainfall_stations_ids, ]
d <- pointDistance(activity_data_full[,c("ws_longitude", "ws_latitude")],
rainfall_stations[,c("longitude", "latitude")],
lonlat=TRUE, allpairs=T)
i <- apply(d, 1, which.min)
activity_data_full$rainfall_ID = rainfall_stations$id[i]
temperature_stations_ids <- colnames(weather_temperature_data_clean)[colnames(weather_temperature_data_clean) %in% weather_stations_data$id]
temperature_stations <- weather_stations_data[weather_stations_data$id %in% temperature_stations_ids, ]
d <- pointDistance(activity_data_full[,c("ws_longitude", "ws_latitude")],
temperature_stations[,c("longitude", "latitude")],
lonlat=TRUE, allpairs=T)
i <- apply(d, 1, which.min)
activity_data_full$temperature_ID = temperature_stations$id[i]
wind_speed_stations_ids <- colnames(weather_wind_speed_data_clean)[colnames(weather_wind_speed_data_clean) %in% weather_stations_data$id]
wind_speed_stations <- weather_stations_data[weather_stations_data$id %in% wind_speed_stations_ids, ]
d <- pointDistance(activity_data_full[,c("ws_longitude", "ws_latitude")],
wind_speed_stations[,c("longitude", "latitude")],
lonlat=TRUE, allpairs=T)
i <- apply(d, 1, which.min)
activity_data_full$wind_speed_ID = wind_speed_stations$id[i]
wind_direction_stations_ids <- colnames(weather_wind_direction_data_clean)[colnames(weather_wind_direction_data_clean) %in% weather_stations_data$id]
wind_direction_stations <- weather_stations_data[weather_stations_data$id %in% wind_direction_stations_ids, ]
d <- pointDistance(activity_data_full[,c("ws_longitude", "ws_latitude")],
wind_direction_stations[,c("longitude", "latitude")],
lonlat=TRUE, allpairs=T)
i <- apply(d, 1, which.min)
activity_data_full$wind_direction_ID = wind_direction_stations$id[i]
head(activity_data_full)
str(weather_humidity_data_clean[,c(humidity_stations_ids,'date_time')])
## 'data.frame': 369750 obs. of 17 variables:
## $ S24 : num 78.2 77.7 76.6 76.3 75.9 75.4 75.6 75.1 75.7 77.1 ...
## $ S43 : num 82.2 81.2 82.5 82.6 82.8 ...
## $ S44 : num 81.4 81.4 81.6 82.1 82.6 82.5 82.5 82.4 82.4 82.3 ...
## $ S50 : num 83.3 83.4 83.4 83.3 83.2 83 83 83.2 83.3 83.7 ...
## $ S60 : num 67.1 67.3 67.8 68.4 67.9 67.7 67.6 67.3 66.7 66.5 ...
## $ S100 : num 84.8 84.9 85 85.1 85.1 85.2 85.4 85.4 85.4 85.5 ...
## $ S102 : num 81.3 81.2 81.3 81.2 81.2 ...
## $ S104 : num 84.5 84.5 84.5 84.4 84.3 84.6 84.6 84.5 84.4 84.2 ...
## $ S106 : num 89.5 89.4 89.3 89.3 89.2 89.1 89.2 89.3 89.2 89.2 ...
## $ S107 : num 75.1 75.8 76.4 75.8 76.3 75.5 75.6 76.4 76.6 78 ...
## $ S108 : num 87.2 87 87.2 87.6 87.9 88.2 88.5 88.7 88.9 89.1 ...
## $ S109 : num 82.5 82.6 82.5 82.3 82.1 81.9 81.6 81.5 81.5 81.5 ...
## $ S111 : num 78.9 78.8 78.3 78.3 78.5 78.3 78.3 77.8 77.9 77.9 ...
## $ S115 : num 71 70.9 71.5 72 71.6 71.7 72 72.1 72.4 72.5 ...
## $ S116 : num 87.1 87.2 86.5 85.5 85.2 85.4 85.8 87 87.8 88.4 ...
## $ S121 : num 86.2 86 86 85.7 85.8 85.6 85.4 85.6 86 86.1 ...
## $ date_time: POSIXct, format: "2022-10-10 00:01:00" "2022-10-10 00:02:00" ...
melted_humidity_data <- reshape2::melt(weather_humidity_data_clean[,c(humidity_stations_ids,'date_time')], id='date_time')
colnames(melted_humidity_data)[colnames(melted_humidity_data) == 'variable'] <- 'humidity_ID'
colnames(melted_humidity_data)[colnames(melted_humidity_data) == 'value'] <- 'humidity'
setDT(melted_humidity_data)
melted_rainfall_data <- reshape2::melt(weather_rainfall_data_clean[,c(rainfall_stations_ids,'date_time')], id='date_time')
colnames(melted_rainfall_data)[colnames(melted_rainfall_data) == 'variable'] <- 'rainfall_ID'
colnames(melted_rainfall_data)[colnames(melted_rainfall_data) == 'value'] <- 'rainfall'
setDT(melted_rainfall_data)
melted_temperature_data <- reshape2::melt(weather_temperature_data_clean[,c(temperature_stations_ids,'date_time')], id='date_time')
colnames(melted_temperature_data)[colnames(melted_temperature_data) == 'variable'] <- 'temperature_ID'
colnames(melted_temperature_data)[colnames(melted_temperature_data) == 'value'] <- 'temperature'
setDT(melted_temperature_data)
melted_wind_speed_data <- reshape2::melt(weather_wind_speed_data_clean[,c(wind_speed_stations_ids,'date_time')], id='date_time')
colnames(melted_wind_speed_data)[colnames(melted_wind_speed_data) == 'variable'] <- 'wind_speed_ID'
colnames(melted_wind_speed_data)[colnames(melted_wind_speed_data) == 'value'] <- 'wind_speed'
setDT(melted_wind_speed_data)
melted_wind_direction_data <- reshape2::melt(weather_wind_direction_data_clean[,c(wind_direction_stations_ids,'date_time')], id='date_time')
colnames(melted_wind_direction_data)[colnames(melted_wind_direction_data) == 'variable'] <- 'wind_direction_ID'
colnames(melted_wind_direction_data)[colnames(melted_wind_direction_data) == 'value'] <- 'wind_direction'
setDT(melted_wind_direction_data)
setDT(activity_data_full)
activity_data_full <- activity_data_full[, c("humidityTime", "humidity") :=
melted_humidity_data[activity_data_full, on = c("humidity_ID", "date_time"), roll = Inf, .(x.date_time, x.humidity)]][]
activity_data_full <- activity_data_full[, c("rainfallTime", "rainfall") :=
melted_rainfall_data[activity_data_full, on = c("rainfall_ID", "date_time"), roll = Inf, .(x.date_time, x.rainfall)]][]
activity_data_full <- activity_data_full[, c("temperatureTime", "temperature") :=
melted_temperature_data[activity_data_full, on = c("temperature_ID", "date_time"), roll = Inf, .(x.date_time, x.temperature)]][]
activity_data_full <- activity_data_full[, c("wind_speedTime", "wind_speed") :=
melted_wind_speed_data[activity_data_full, on = c("wind_speed_ID", "date_time"), roll = Inf, .(x.date_time, x.wind_speed)]][]
activity_data_full <- activity_data_full[, c("wind_directionTime", "wind_direction") :=
melted_wind_direction_data[activity_data_full, on = c("wind_direction_ID", "date_time"), roll = Inf, .(x.date_time, x.wind_direction)]][]
head(activity_data_full, 100)
selected_data <- activity_data_full[,c(
'id_participant',
'ws_longitude',
'ws_latitude',
'dist_walked',
'average_heart_rate',
'q_location',
'Green.View.Mean',
'Footprint.Mean',
'Perimeter.Mean',
'Building.Count',
'Sky.View.Mean',
'Building.View.Mean',
'Road.View.Mean',
'humidity',
'rainfall',
'temperature',
'wind_speed',
'wind_direction',
'q_thermal_preference',
'date_time',
'dT',
'Visual.Complexity.Mean'
)]
selected_data <- drop_na(selected_data)
selected_data$q_location <- as.factor(selected_data$q_location)
selected_data$q_thermal_preference <-
as.factor(selected_data$q_thermal_preference)
summary(selected_data)
## id_participant ws_longitude ws_latitude dist_walked
## Length:4379 Min. :103.6 Min. :1.246 Min. : 10.0
## Class :character 1st Qu.:103.8 1st Qu.:1.297 1st Qu.: 107.7
## Mode :character Median :103.8 Median :1.319 Median : 227.9
## Mean :103.8 Mean :1.332 Mean : 342.6
## 3rd Qu.:103.8 3rd Qu.:1.355 3rd Qu.: 472.1
## Max. :104.0 Max. :1.454 Max. :4173.2
## average_heart_rate q_location Green.View.Mean Footprint.Mean
## Min. : 43.50 Indoor - Class : 267 Min. :0.0000 Min. : 0
## 1st Qu.: 72.17 Indoor - Home :2039 1st Qu.:0.2670 1st Qu.: 1045
## Median : 79.67 Indoor - Office: 704 Median :0.3225 Median : 1357
## Mean : 81.37 Indoor - Other : 581 Mean :0.3204 Mean : 2367
## 3rd Qu.: 88.00 Outdoor : 517 3rd Qu.:0.3836 3rd Qu.: 1834
## Max. :178.50 Transportation : 271 Max. :0.6560 Max. :64330
## Perimeter.Mean Building.Count Sky.View.Mean Building.View.Mean
## Min. : 0.0 Min. : 0.00 Min. :0.0000 Min. :0.00000
## 1st Qu.: 150.9 1st Qu.: 22.50 1st Qu.:0.1000 1st Qu.:0.07229
## Median : 191.7 Median : 32.00 Median :0.1453 Median :0.10687
## Mean : 219.7 Mean : 41.91 Mean :0.1577 Mean :0.12802
## 3rd Qu.: 251.7 3rd Qu.: 44.00 3rd Qu.:0.2113 3rd Qu.:0.17388
## Max. :1834.8 Max. :395.00 Max. :0.4772 Max. :0.55050
## Road.View.Mean humidity rainfall temperature
## Min. :0.00000 Min. :34.90 Min. : 0.00000 Min. :22.9
## 1st Qu.:0.09087 1st Qu.:66.10 1st Qu.: 0.00000 1st Qu.:27.4
## Median :0.12242 Median :74.40 Median : 0.00000 Median :29.2
## Mean :0.12753 Mean :74.76 Mean : 0.04111 Mean :29.0
## 3rd Qu.:0.15535 3rd Qu.:83.30 3rd Qu.: 0.00000 3rd Qu.:30.6
## Max. :0.37500 Max. :99.50 Max. :11.31200 Max. :35.0
## wind_speed wind_direction q_thermal_preference
## Min. : 0.300 Min. : 1 Cooler :1736
## 1st Qu.: 2.600 1st Qu.: 73 No change:2377
## Median : 4.200 Median :196 Warmer : 266
## Mean : 4.795 Mean :185
## 3rd Qu.: 6.200 3rd Qu.:286
## Max. :17.400 Max. :359
## date_time dT Visual.Complexity.Mean
## Min. :2022-10-10 13:13:19.00 Min. : 55.00 Min. :0.000
## 1st Qu.:2022-11-16 15:07:59.00 1st Qu.: 79.59 1st Qu.:1.601
## Median :2023-03-29 17:45:24.00 Median : 121.10 Median :1.712
## Mean :2023-02-08 08:38:30.94 Mean : 368.76 Mean :1.694
## 3rd Qu.:2023-04-24 18:08:26.50 3rd Qu.: 275.25 3rd Qu.:1.793
## Max. :2023-05-30 17:00:31.00 Max. :34368.42 Max. :2.126
selected_data %>% plot_histogram(ggtheme = theme_minimal())
selected_data_no_outliers <- selected_data[(
selected_data$dT < 500 & # remove measurements that are infrequent
selected_data$Footprint.Mean < 10000
),]
#selected_data_no_outliers <- selected_data_no_outliers[selected_data_no_outliers$q_location == "Outdoor", ]
selected_data_no_outliers %>% plot_histogram(ggtheme = theme_minimal())
ggplot(data=selected_data_no_outliers,aes(q_thermal_preference)) +
geom_bar(aes(fill=as.factor(round(temperature)))) + theme_minimal() +
scale_color_gradient2(low = "blue", mid = "white", high = "red", space = "Lab" )
mapboxToken <- paste(readLines("mapbox_token.txt"), collapse="")
## Warning in readLines("mapbox_token.txt"): incomplete final line found on
## 'mapbox_token.txt'
# creating a sample data.frame with your lat/lon points
lon <- selected_data_no_outliers$ws_longitude
lat <- selected_data_no_outliers$ws_latitude
thermal_preference <- selected_data_no_outliers$q_thermal_preference
df <- as.data.frame(cbind(lon,lat))
df <- df %>%
arrange(thermal_preference) %>%
mutate(thermal_preference = as.factor(thermal_preference),
color = recode(thermal_preference,'Cooler' = "#fe4a49",
"No change" = "#fed766", "Warmer" = "#009fb7"))
fig <- df
fig <- fig %>%
plot_ly(
lat = ~lat,
lon = ~lon,
type = 'scattermapbox',
mode = "markers",
color = ~thermal_preference,
legendgroup = ~thermal_preference,
marker = list(size=7))
fig <- fig %>%
layout(
mapbox = list(
style = 'light',
zoom =10,
center = list(lon = mean(df$lon), lat = mean(df$lat))))
fig <- fig %>%
config(mapboxAccessToken = mapboxToken)
pb <- plotly_build(fig)
pb
ggplot(selected_data_no_outliers, aes(date_time, temperature)) + geom_point() + theme_minimal()
as.numeric(survey_data$date_time) %>%
plot_histogram(ggtheme = theme_minimal())
# Temperature increases during the day
ggplot(selected_data[ selected_data$date_time > ymd("2022/10/18") &
selected_data$date_time < ymd("2022/10/19") #&
,], aes(date_time, temperature)) + geom_point() + theme_minimal()
========================Model Testing Starts
Here===============================
selected_data_log <-
subset(
selected_data_no_outliers,
select = -c(
id_participant,
ws_longitude,
ws_latitude,
Perimeter.Mean,
humidity,
wind_direction,
Building.Count,
dT
)
)
selected_data_log$q_thermal_preference <-
as.factor(selected_data_log$q_thermal_preference == "Cooler")
selected_data_log$is_outdoor <-
as.factor(selected_data_log$q_location == "Outdoor")
#
selected_data_log$is_winter <- as.factor(selected_data_log$date_time > ym("2023/04"))
selected_data_log$is_day <- as.factor((hour(selected_data_log$date_time) > 12 &
hour(selected_data_log$date_time) < 18) == T)
selected_data_log <-
subset(selected_data_log, select = -c(q_location, date_time))
set.seed(1)
# Divide the data into 80% training and 20% testing
train <-
sample(1:nrow(selected_data_log),
size = round(nrow(selected_data_log) * 0.8),
replace = FALSE)
selected_data_log_train <- selected_data_log[train, ]
selected_data_log_test <- selected_data_log[-train, ]
selected_data_log %>%
mutate(is_outdoor = as.numeric(is_outdoor)) %>%
mutate(q_thermal_preference = as.numeric(q_thermal_preference)) %>%
mutate(is_winter = as.numeric(is_winter)) %>%
mutate(is_day = as.numeric(is_day)) %>%
cor(use = "pairwise.complete.obs") %>%
corrplot(order = 'alphabet', diag = F)
str(selected_data_log_train)
## Classes 'data.table' and 'data.frame': 2678 obs. of 15 variables:
## $ dist_walked : num 405.4 28.8 237 89.1 980.4 ...
## $ average_heart_rate : num 75 75.3 69 79.7 90 ...
## $ Green.View.Mean : num 0.265 0.405 0.367 0.259 0.276 ...
## $ Footprint.Mean : num 1255 1435 1273 1802 1079 ...
## $ Sky.View.Mean : num 0.106 0.132 0.152 0.191 0.288 ...
## $ Building.View.Mean : num 0.2684 0.0782 0.032 0.1297 0.0589 ...
## $ Road.View.Mean : num 0.1155 0.2062 0.1972 0.2013 0.0868 ...
## $ rainfall : num 0 0 0 0 0 0 0 0 0 0 ...
## $ temperature : num 25.5 29.7 26.4 30.7 29.3 27.9 32.5 30.9 26.4 26 ...
## $ wind_speed : num 1.6 5.7 2.5 4.4 7.2 2 4.2 4.7 4.4 2.2 ...
## $ q_thermal_preference : Factor w/ 2 levels "FALSE","TRUE": 1 2 2 1 1 2 1 1 2 1 ...
## $ Visual.Complexity.Mean: num 1.78 1.78 1.38 1.86 1.81 ...
## $ is_outdoor : Factor w/ 2 levels "FALSE","TRUE": 1 1 1 1 1 1 2 2 1 1 ...
## $ is_winter : Factor w/ 2 levels "FALSE","TRUE": 1 1 2 1 1 2 2 2 1 1 ...
## $ is_day : Factor w/ 2 levels "FALSE","TRUE": 1 2 1 2 1 1 2 2 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
model <- glm(q_thermal_preference ~ . , family = binomial(link = "logit"), data = selected_data_log_train)
summary(model)
##
## Call:
## glm(formula = q_thermal_preference ~ ., family = binomial(link = "logit"),
## data = selected_data_log_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.941e+00 8.657e-01 -5.708 1.15e-08 ***
## dist_walked -2.028e-04 1.211e-04 -1.674 0.09403 .
## average_heart_rate 2.274e-04 2.790e-03 0.081 0.93505
## Green.View.Mean 1.670e+00 5.934e-01 2.814 0.00489 **
## Footprint.Mean -4.790e-06 3.066e-05 -0.156 0.87585
## Sky.View.Mean 2.894e+00 6.642e-01 4.357 1.32e-05 ***
## Building.View.Mean 2.149e+00 7.171e-01 2.997 0.00273 **
## Road.View.Mean 1.055e+00 8.797e-01 1.199 0.23052
## rainfall -2.109e-02 1.612e-01 -0.131 0.89593
## temperature 1.037e-01 2.086e-02 4.971 6.65e-07 ***
## wind_speed 1.133e-02 1.454e-02 0.780 0.43557
## Visual.Complexity.Mean -6.921e-02 2.788e-01 -0.248 0.80392
## is_outdoorTRUE 7.059e-01 1.216e-01 5.807 6.37e-09 ***
## is_winterTRUE 3.599e-01 8.951e-02 4.021 5.80e-05 ***
## is_dayTRUE -1.947e-01 8.453e-02 -2.303 0.02125 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3578.7 on 2677 degrees of freedom
## Residual deviance: 3452.2 on 2663 degrees of freedom
## AIC: 3482.2
##
## Number of Fisher Scoring iterations: 4
model2 <-
glm(
q_thermal_preference ~ is_winter + is_outdoor + is_day + temperature + Sky.View.Mean,
family = binomial(link = "logit"),
data = selected_data_log_train
)
summary(model2)
##
## Call:
## glm(formula = q_thermal_preference ~ is_winter + is_outdoor +
## is_day + temperature + Sky.View.Mean, family = binomial(link = "logit"),
## data = selected_data_log_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.88353 0.57885 -6.709 1.96e-11 ***
## is_winterTRUE 0.35545 0.08829 4.026 5.67e-05 ***
## is_outdoorTRUE 0.68305 0.12022 5.682 1.33e-08 ***
## is_dayTRUE -0.19882 0.08324 -2.389 0.01691 *
## temperature 0.10306 0.02025 5.090 3.58e-07 ***
## Sky.View.Mean 1.59007 0.51023 3.116 0.00183 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3578.7 on 2677 degrees of freedom
## Residual deviance: 3466.4 on 2672 degrees of freedom
## AIC: 3478.4
##
## Number of Fisher Scoring iterations: 4
# Step 3: Predict probabilities
probabilities <- predict(model2, selected_data_log_test, type = "response")
# Step 4 and 5: Use different cutoffs and calculate accuracy
cutoffs <- c(0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
accuracies <- sapply(cutoffs, function(cutoff) {
# Convert probabilities to binary predictions
predictions <- ifelse(probabilities > cutoff, 2, 1)
# Calculate accuracy
accuracy(as.numeric(selected_data_log_test$q_thermal_preference), predictions)
#mean(predictions == as.numeric(selected_data_log_test$q_thermal_preference))
})
# Print the accuracies for each cutoff
names(accuracies) <- cutoffs
accuracies
## 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9
## 0.4059701 0.4701493 0.6029851 0.6238806 0.6208955 0.6014925 0.6000000 0.6000000
pR2(model2)['McFadden']
## fitting null model for pseudo-r2
## McFadden
## 0.03139527
fit.tree = rpart(q_thermal_preference ~ ., data=selected_data_log_train, method="class", cp=0.008)
prp(fit.tree,
main = "Tree model for predicting if thermal preference is \"Cooler\"",
box.palette = "auto",
fallen.leaves = F,
shadow.col = "gray",
branch.lty = 3,
branch = .5,
faclen = 0,
round = 0)
printcp(fit.tree)
##
## Classification tree:
## rpart(formula = q_thermal_preference ~ ., data = selected_data_log_train,
## method = "class", cp = 0.008)
##
## Variables actually used in tree construction:
## [1] Building.View.Mean is_outdoor is_winter Sky.View.Mean
## [5] temperature
##
## Root node error: 1041/2678 = 0.38872
##
## n= 2678
##
## CP nsplit rel error xerror xstd
## 1 0.023055 0 1.00000 1.00000 0.024232
## 2 0.010567 5 0.86647 0.94524 0.023966
## 3 0.008000 6 0.85591 0.93084 0.023888
bestcp <- fit.tree$cptable[which.min(fit.tree$cptable[,"xerror"]),"CP"]
pruned.tree <- prune(fit.tree, cp = bestcp)
prp(pruned.tree,
main = "Tree model for predicting if thermal preference is \"Cooler\"",
box.palette = "auto",
fallen.leaves = F,
shadow.col = "gray",
branch.lty = 3,
branch = .5,
faclen = 0,
round = 0)
predicted <- as.numeric(predict(pruned.tree, selected_data_log_test, type = "class"))
sum(predicted == as.numeric(selected_data_log_test$q_thermal_preference)) / nrow(selected_data_log_test)
## [1] 0.6641791
Metrics::accuracy(as.numeric(selected_data_log_test$q_thermal_preference), predicted)
## [1] 0.6641791
SS_tot <- sum((as.numeric(selected_data_log_train$q_thermal_preference) - mean(as.numeric(selected_data_log_train$q_thermal_preference))) ^ 2)
SS_res_tree <- sum((as.numeric(selected_data_log_train$q_thermal_preference) - as.numeric(predict(pruned.tree, selected_data_log_train, type = "class"))) ^ 2)
R_sq_lm <- 1 - SS_res_tree / SS_tot
R_sq_lm
## [1] -0.4001961
set.seed(50)
model_forest <-
randomForest(
q_thermal_preference ~ . ,
data = selected_data_log_train,
importance = TRUE,
ntree = 150
)
predicted <- as.numeric(predict(model_forest, selected_data_log_test))
sum(predicted == as.numeric(selected_data_log_test$q_thermal_preference)) / nrow(selected_data_log_test)
## [1] 0.7059701
Metrics::accuracy(as.numeric(selected_data_log_test$q_thermal_preference), predicted)
## [1] 0.7059701
randomForest::importance(model_forest)
## FALSE TRUE MeanDecreaseAccuracy
## dist_walked 0.8563182 1.37172663 1.4746404
## average_heart_rate -0.8833980 0.05711924 -0.6839402
## Green.View.Mean 12.0010150 9.75821062 16.8150583
## Footprint.Mean 13.6651554 9.88235906 17.5411607
## Sky.View.Mean 13.7253629 12.71720495 19.0889554
## Building.View.Mean 12.2702192 10.35629188 18.4378167
## Road.View.Mean 13.4612004 13.76639433 18.6040803
## rainfall 3.7616951 0.46017916 3.8177337
## temperature 9.3007950 5.27828073 10.9330602
## wind_speed 5.9826921 0.98871531 5.3589595
## Visual.Complexity.Mean 13.9860610 12.11331136 16.6666900
## is_outdoor 9.1658534 13.02425758 14.0035736
## is_winter 11.5291223 10.40958961 13.1356087
## is_day 0.1084446 1.07094103 0.7524310
## MeanDecreaseGini
## dist_walked 124.84328
## average_heart_rate 116.39439
## Green.View.Mean 103.87575
## Footprint.Mean 129.76265
## Sky.View.Mean 120.27174
## Building.View.Mean 108.88783
## Road.View.Mean 110.00430
## rainfall 11.21246
## temperature 127.13448
## wind_speed 113.41158
## Visual.Complexity.Mean 120.65905
## is_outdoor 25.65583
## is_winter 21.58707
## is_day 20.35907
SS_tot <- sum((as.numeric(selected_data_log_train$q_thermal_preference) - mean(as.numeric(selected_data_log_train$q_thermal_preference))) ^ 2)
SS_res_tree <- sum((as.numeric(selected_data_log_train$q_thermal_preference) - as.numeric(predict(model_forest, selected_data_log_train))) ^ 2)
R_sq_lm <- 1 - SS_res_tree / SS_tot
R_sq_lm
## [1] 0.9984285
selected_data_multinom <-
subset(
selected_data_no_outliers,
select = -c(
id_participant,
ws_longitude,
ws_latitude,
Perimeter.Mean,
wind_direction,
Building.Count,
dT
)
)
selected_data_multinom$q_thermal_preference <-
as.factor(selected_data_multinom$q_thermal_preference)
selected_data_multinom$is_outdoor <-
as.factor(selected_data_multinom$q_location == "Outdoor")
selected_data_multinom$is_winter <- as.factor(selected_data_multinom$date_time > ym("2023/04"))
selected_data_multinom$is_day <- as.factor((hour(selected_data_multinom$date_time) > 12 &
hour(selected_data_multinom$date_time) < 18) == T)
selected_data_multinom <-
subset(selected_data_multinom, select = -c(q_location, date_time))
set.seed(2)
# Divide the data into 80% training and 20% testing
train <-
sample(1:nrow(selected_data_multinom),
size = round(nrow(selected_data_multinom) * 0.8),
replace = FALSE)
selected_data_multinom_train <- selected_data_multinom[train, ]
selected_data_multinom_test <- selected_data_multinom[-train, ]
selected_data_multinom %>%
mutate(is_outdoor = as.numeric(is_outdoor)) %>%
mutate(q_thermal_preference = as.numeric(q_thermal_preference)) %>%
mutate(is_winter = as.numeric(is_winter)) %>%
mutate(is_day = as.numeric(is_day)) %>%
cor(use = "pairwise.complete.obs") %>%
corrplot(order = 'alphabet', diag = F)
model_multinom <- multinom(q_thermal_preference ~ ., data = selected_data_multinom_train)
## # weights: 51 (32 variable)
## initial value 2942.083709
## iter 10 value 2621.325508
## iter 20 value 2333.878059
## iter 30 value 2262.114329
## iter 40 value 2261.403953
## final value 2261.403665
## converged
summary(model_multinom)
## Call:
## multinom(formula = q_thermal_preference ~ ., data = selected_data_multinom_train)
##
## Coefficients:
## (Intercept) dist_walked average_heart_rate Green.View.Mean
## No change 3.7752521 0.0002199776 -0.001969002 -1.199221
## Warmer 0.4628009 -0.0006249347 0.002440529 -1.283316
## Footprint.Mean Sky.View.Mean Building.View.Mean Road.View.Mean
## No change 2.388761e-06 -2.831785 -2.2248894 -1.000906
## Warmer 1.008029e-04 -1.112459 0.8609189 2.105141
## humidity rainfall temperature wind_speed
## No change 0.004302523 0.002536116 -0.09158742 -0.003618452
## Warmer -0.001101559 0.122157617 -0.02482563 0.048528087
## Visual.Complexity.Mean is_outdoorTRUE is_winterTRUE is_dayTRUE
## No change 0.2782135 -0.6701643 -0.3502048 0.1534436
## Warmer -0.9386555 -0.4311649 -0.4710044 0.3879659
##
## Std. Errors:
## (Intercept) dist_walked average_heart_rate Green.View.Mean
## No change 0.0014990947 0.0001231608 0.002853019 0.0012844538
## Warmer 0.0005027521 0.0002912159 0.005606995 0.0003877144
## Footprint.Mean Sky.View.Mean Building.View.Mean Road.View.Mean
## No change 3.176422e-05 0.0023649486 0.0008636186 0.0006949420
## Warmer 5.492044e-05 0.0004844795 0.0003588285 0.0001798301
## humidity rainfall temperature wind_speed Visual.Complexity.Mean
## No change 0.002684677 0.10679755 0.009805298 0.01436564 0.004708234
## Warmer 0.005160460 0.04063834 0.017780836 0.02636042 0.001055362
## is_outdoorTRUE is_winterTRUE is_dayTRUE
## No change 0.11756837 0.08220748 0.07980995
## Warmer 0.01780133 0.01578459 0.01499350
##
## Residual Deviance: 4522.807
## AIC: 4586.807
tidy(model_multinom, conf.int = TRUE) %>%
kable() %>%
kable_styling("basic", full_width = FALSE)
| y.level | term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|---|
| No change | (Intercept) | 3.7752521 | 0.0014991 | 2518.3546648 | 0.0000000 | 3.7723139 | 3.7781903 |
| No change | dist_walked | 0.0002200 | 0.0001232 | 1.7861005 | 0.0740830 | -0.0000214 | 0.0004614 |
| No change | average_heart_rate | -0.0019690 | 0.0028530 | -0.6901468 | 0.4901019 | -0.0075608 | 0.0036228 |
| No change | Green.View.Mean | -1.1992206 | 0.0012845 | -933.6424328 | 0.0000000 | -1.2017381 | -1.1967031 |
| No change | Footprint.Mean | 0.0000024 | 0.0000318 | 0.0752029 | 0.9400533 | -0.0000599 | 0.0000646 |
| No change | Sky.View.Mean | -2.8317855 | 0.0023649 | -1197.3983070 | 0.0000000 | -2.8364207 | -2.8271503 |
| No change | Building.View.Mean | -2.2248894 | 0.0008636 | -2576.2409104 | 0.0000000 | -2.2265821 | -2.2231968 |
| No change | Road.View.Mean | -1.0009062 | 0.0006949 | -1440.2730335 | 0.0000000 | -1.0022682 | -0.9995441 |
| No change | humidity | 0.0043025 | 0.0026847 | 1.6026222 | 0.1090181 | -0.0009593 | 0.0095644 |
| No change | rainfall | 0.0025361 | 0.1067976 | 0.0237469 | 0.9810545 | -0.2067832 | 0.2118555 |
| No change | temperature | -0.0915874 | 0.0098053 | -9.3406054 | 0.0000000 | -0.1108055 | -0.0723694 |
| No change | wind_speed | -0.0036185 | 0.0143656 | -0.2518825 | 0.8011319 | -0.0317746 | 0.0245377 |
| No change | Visual.Complexity.Mean | 0.2782135 | 0.0047082 | 59.0908400 | 0.0000000 | 0.2689856 | 0.2874415 |
| No change | is_outdoorTRUE | -0.6701643 | 0.1175684 | -5.7002093 | 0.0000000 | -0.9005941 | -0.4397345 |
| No change | is_winterTRUE | -0.3502048 | 0.0822075 | -4.2600111 | 0.0000204 | -0.5113285 | -0.1890811 |
| No change | is_dayTRUE | 0.1534436 | 0.0798100 | 1.9226122 | 0.0545288 | -0.0029810 | 0.3098682 |
| Warmer | (Intercept) | 0.4628009 | 0.0005028 | 920.5351224 | 0.0000000 | 0.4618156 | 0.4637863 |
| Warmer | dist_walked | -0.0006249 | 0.0002912 | -2.1459497 | 0.0318770 | -0.0011957 | -0.0000542 |
| Warmer | average_heart_rate | 0.0024405 | 0.0056070 | 0.4352651 | 0.6633700 | -0.0085490 | 0.0134300 |
| Warmer | Green.View.Mean | -1.2833160 | 0.0003877 | -3309.9517812 | 0.0000000 | -1.2840759 | -1.2825561 |
| Warmer | Footprint.Mean | 0.0001008 | 0.0000549 | 1.8354347 | 0.0664413 | -0.0000068 | 0.0002084 |
| Warmer | Sky.View.Mean | -1.1124589 | 0.0004845 | -2296.1938892 | 0.0000000 | -1.1134085 | -1.1115094 |
| Warmer | Building.View.Mean | 0.8609189 | 0.0003588 | 2399.2487940 | 0.0000000 | 0.8602156 | 0.8616222 |
| Warmer | Road.View.Mean | 2.1051415 | 0.0001798 | 11706.2797585 | 0.0000000 | 2.1047890 | 2.1054939 |
| Warmer | humidity | -0.0011016 | 0.0051605 | -0.2134615 | 0.8309670 | -0.0112159 | 0.0090128 |
| Warmer | rainfall | 0.1221576 | 0.0406383 | 3.0059696 | 0.0026474 | 0.0425079 | 0.2018073 |
| Warmer | temperature | -0.0248256 | 0.0177808 | -1.3962017 | 0.1626538 | -0.0596754 | 0.0100242 |
| Warmer | wind_speed | 0.0485281 | 0.0263604 | 1.8409453 | 0.0656296 | -0.0031374 | 0.1001936 |
| Warmer | Visual.Complexity.Mean | -0.9386555 | 0.0010554 | -889.4156712 | 0.0000000 | -0.9407240 | -0.9365870 |
| Warmer | is_outdoorTRUE | -0.4311649 | 0.0178013 | -24.2209411 | 0.0000000 | -0.4660549 | -0.3962749 |
| Warmer | is_winterTRUE | -0.4710044 | 0.0157846 | -29.8395155 | 0.0000000 | -0.5019416 | -0.4400672 |
| Warmer | is_dayTRUE | 0.3879659 | 0.0149935 | 25.8756094 | 0.0000000 | 0.3585792 | 0.4173527 |
model_multinom2 <- multinom(q_thermal_preference ~ is_outdoor + is_winter + temperature + humidity + Green.View.Mean + Sky.View.Mean + Building.View.Mean + Road.View.Mean, data = selected_data_multinom_train)
## # weights: 30 (18 variable)
## initial value 2942.083709
## iter 10 value 2339.496857
## iter 20 value 2276.639460
## final value 2276.637693
## converged
summary(model_multinom)
## Call:
## multinom(formula = q_thermal_preference ~ ., data = selected_data_multinom_train)
##
## Coefficients:
## (Intercept) dist_walked average_heart_rate Green.View.Mean
## No change 3.7752521 0.0002199776 -0.001969002 -1.199221
## Warmer 0.4628009 -0.0006249347 0.002440529 -1.283316
## Footprint.Mean Sky.View.Mean Building.View.Mean Road.View.Mean
## No change 2.388761e-06 -2.831785 -2.2248894 -1.000906
## Warmer 1.008029e-04 -1.112459 0.8609189 2.105141
## humidity rainfall temperature wind_speed
## No change 0.004302523 0.002536116 -0.09158742 -0.003618452
## Warmer -0.001101559 0.122157617 -0.02482563 0.048528087
## Visual.Complexity.Mean is_outdoorTRUE is_winterTRUE is_dayTRUE
## No change 0.2782135 -0.6701643 -0.3502048 0.1534436
## Warmer -0.9386555 -0.4311649 -0.4710044 0.3879659
##
## Std. Errors:
## (Intercept) dist_walked average_heart_rate Green.View.Mean
## No change 0.0014990947 0.0001231608 0.002853019 0.0012844538
## Warmer 0.0005027521 0.0002912159 0.005606995 0.0003877144
## Footprint.Mean Sky.View.Mean Building.View.Mean Road.View.Mean
## No change 3.176422e-05 0.0023649486 0.0008636186 0.0006949420
## Warmer 5.492044e-05 0.0004844795 0.0003588285 0.0001798301
## humidity rainfall temperature wind_speed Visual.Complexity.Mean
## No change 0.002684677 0.10679755 0.009805298 0.01436564 0.004708234
## Warmer 0.005160460 0.04063834 0.017780836 0.02636042 0.001055362
## is_outdoorTRUE is_winterTRUE is_dayTRUE
## No change 0.11756837 0.08220748 0.07980995
## Warmer 0.01780133 0.01578459 0.01499350
##
## Residual Deviance: 4522.807
## AIC: 4586.807
tidy(model_multinom2, conf.int = TRUE) %>%
kable() %>%
kable_styling("basic", full_width = FALSE)
| y.level | term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|---|
| No change | (Intercept) | 4.1357486 | 1.7413921 | 2.3749669 | 0.0175505 | 0.7226828 | 7.5488145 |
| No change | is_outdoorTRUE | -0.6759504 | 0.1230572 | -5.4929750 | 0.0000000 | -0.9171381 | -0.4347626 |
| No change | is_winterTRUE | -0.3716983 | 0.0916106 | -4.0573721 | 0.0000496 | -0.5512517 | -0.1921448 |
| No change | temperature | -0.0870918 | 0.0406158 | -2.1442827 | 0.0320102 | -0.1666974 | -0.0074863 |
| No change | humidity | 0.0038375 | 0.0072002 | 0.5329792 | 0.5940480 | -0.0102745 | 0.0179496 |
| No change | Green.View.Mean | -1.3402841 | 0.5842772 | -2.2939181 | 0.0217952 | -2.4854464 | -0.1951218 |
| No change | Sky.View.Mean | -2.7604770 | 0.6583995 | -4.1927079 | 0.0000276 | -4.0509164 | -1.4700376 |
| No change | Building.View.Mean | -2.1534922 | 0.7238279 | -2.9751438 | 0.0029285 | -3.5721689 | -0.7348155 |
| No change | Road.View.Mean | -0.7749980 | 0.9030069 | -0.8582415 | 0.3907591 | -2.5448589 | 0.9948630 |
| Warmer | (Intercept) | -2.1556697 | 3.3693787 | -0.6397825 | 0.5223140 | -8.7595307 | 4.4481913 |
| Warmer | is_outdoorTRUE | -0.4533492 | 0.2486565 | -1.8231946 | 0.0682739 | -0.9407070 | 0.0340086 |
| Warmer | is_winterTRUE | -0.4751693 | 0.1821329 | -2.6089147 | 0.0090830 | -0.8321433 | -0.1181953 |
| Warmer | temperature | 0.0289362 | 0.0789534 | 0.3664977 | 0.7139937 | -0.1258096 | 0.1836820 |
| Warmer | humidity | 0.0057392 | 0.0138387 | 0.4147213 | 0.6783459 | -0.0213841 | 0.0328626 |
| Warmer | Green.View.Mean | -1.3286374 | 1.1668583 | -1.1386451 | 0.2548512 | -3.6156377 | 0.9583628 |
| Warmer | Sky.View.Mean | -2.3908784 | 1.3161961 | -1.8165062 | 0.0692928 | -4.9705754 | 0.1888186 |
| Warmer | Building.View.Mean | -0.1337018 | 1.3852025 | -0.0965215 | 0.9231064 | -2.8486488 | 2.5812451 |
| Warmer | Road.View.Mean | 1.7403543 | 1.7774074 | 0.9791533 | 0.3275042 | -1.7433001 | 5.2240087 |
predicted <-
predict(model_multinom2, selected_data_multinom_test, type="class")
sum(predicted == selected_data_multinom_test$q_thermal_preference) / nrow(selected_data_multinom_test)
## [1] 0.561194
pR2(model_multinom2)['McFadden']
## fitting null model for pseudo-r2
## # weights: 6 (2 variable)
## initial value 2942.083709
## final value 2338.881449
## converged
## McFadden
## 0.02661262
Metrics::accuracy(selected_data_multinom_test$q_thermal_preference, predict(model_multinom2, selected_data_multinom_test, type="class"))
## [1] 0.561194
table(predict(model_multinom2, selected_data_multinom_test, type = "class"))
##
## Cooler No change Warmer
## 157 513 0
fit.tree_multinom = rpart(q_thermal_preference ~ ., data=selected_data_multinom_train, method="class", cp=0.008)
prp(fit.tree_multinom,
main = "Tree model for predicting actual thermal preference",
box.palette = "auto",
fallen.leaves = F,
shadow.col = "gray",
branch.lty = 3,
branch = .5,
faclen = 0,
round = 0)
fit.tree_multinom$variable.importance
## temperature humidity Sky.View.Mean
## 26.08085381 24.23874296 14.79659833
## is_outdoor Footprint.Mean Visual.Complexity.Mean
## 13.51492219 11.84085355 4.19756349
## Building.View.Mean Green.View.Mean is_winter
## 3.52949344 3.24815609 3.11076425
## Road.View.Mean wind_speed average_heart_rate
## 0.64676087 0.20953368 0.09875442
bestcp_multinom <- fit.tree_multinom$cptable[which.min(fit.tree_multinom$cptable[,"xerror"]),"CP"]
bestcp_multinom
## [1] 0.008
final_tree_model <- prune(fit.tree_multinom, cp = bestcp_multinom)
final_tree_model$variable.importance
## temperature humidity Sky.View.Mean
## 26.08085381 24.23874296 14.79659833
## is_outdoor Footprint.Mean Visual.Complexity.Mean
## 13.51492219 11.84085355 4.19756349
## Building.View.Mean Green.View.Mean is_winter
## 3.52949344 3.24815609 3.11076425
## Road.View.Mean wind_speed average_heart_rate
## 0.64676087 0.20953368 0.09875442
prp(final_tree_model,
main = "Tree model for predicting actual thermal preference",
box.palette = "auto",
fallen.leaves = F,
shadow.col = "gray",
branch.lty = 3,
branch = .5,
faclen = 0,
round = 0)
final_tree_model <-
rpart(
q_thermal_preference ~
Visual.Complexity.Mean +
Footprint.Mean +
Sky.View.Mean +
Green.View.Mean +
Road.View.Mean +
Sky.View.Mean +
temperature +
humidity +
is_outdoor,
data = selected_data_multinom_train,
method = "class",
cp = 0.008
)
prp(final_tree_model,
main = "Tree model for predicting actual thermal preference",
box.palette = "auto",
fallen.leaves = F,
shadow.col = "gray",
branch.lty = 3,
branch = .5,
faclen = 0,
round = 0)
predicted <- as.numeric(predict(final_tree_model, selected_data_multinom_test, type = "class"))
sum(predicted == as.numeric(selected_data_multinom_test$q_thermal_preference)) / nrow(selected_data_multinom_test)
## [1] 0.5641791
table(predict(final_tree_model, selected_data_multinom_test, type = "class"))
##
## Cooler No change Warmer
## 221 449 0
set.seed(50)
model_forest_multinom <-
randomForest(
q_thermal_preference ~ . -average_heart_rate -dist_walked -rainfall -wind_speed,
data = selected_data_multinom_train,
importance = TRUE,
ntree = 200
)
predicted <- as.numeric(predict(model_forest_multinom, selected_data_multinom_test))
sum(predicted == as.numeric(selected_data_multinom_test$q_thermal_preference)) / nrow(selected_data_multinom_test)
## [1] 0.6537313
randomForest::importance(model_forest_multinom)
## Cooler No change Warmer MeanDecreaseAccuracy
## Green.View.Mean 14.4061414 17.415267 11.719994 23.938412
## Footprint.Mean 17.6342879 15.588526 12.198786 24.695391
## Sky.View.Mean 20.8624277 19.429347 12.203289 31.269214
## Building.View.Mean 18.0361932 18.736024 9.164805 27.435648
## Road.View.Mean 22.3109888 21.242090 12.353233 31.763268
## humidity 8.5017913 13.330698 3.244791 17.787586
## temperature 11.1749083 12.814978 1.724296 17.150709
## Visual.Complexity.Mean 19.8946811 19.867053 13.797653 27.315548
## is_outdoor 12.2169349 8.714548 2.318363 14.959958
## is_winter 13.4873383 14.814294 6.457539 16.964393
## is_day -0.1618654 1.811180 1.614403 1.500995
## MeanDecreaseGini
## Green.View.Mean 140.96915
## Footprint.Mean 181.56286
## Sky.View.Mean 153.64321
## Building.View.Mean 149.70710
## Road.View.Mean 149.59443
## humidity 203.45012
## temperature 186.22944
## Visual.Complexity.Mean 153.70236
## is_outdoor 28.69651
## is_winter 26.44489
## is_day 32.63674
table(predict(model_forest_multinom, selected_data_multinom_test, type = "class"))
##
## Cooler No change Warmer
## 233 426 11
table(selected_data_multinom_test$q_thermal_preference)
##
## Cooler No change Warmer
## 282 346 42
# if we predict "No Change" we will be correct 51% of the time
sum(selected_data_multinom_test$q_thermal_preference == "No change") / nrow(selected_data_multinom_test)
## [1] 0.5164179
========================Model Testing Ends Here=================================